R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Loading Libraries

## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.2
## Warning: package 'purrr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## Warning: package 'stringr' was built under R version 4.0.2
## Warning: package 'forcats' was built under R version 4.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Warning: package 'xgboost' was built under R version 4.0.2
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
## Warning: package 'caret' was built under R version 4.0.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## Warning: package 'VIM' was built under R version 4.0.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.0.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
## corrplot 0.90 loaded
## Warning: package 'ggthemes' was built under R version 4.0.2
## Warning: package 'plotly' was built under R version 4.0.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Warning: package 'factoextra' was built under R version 4.0.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Warning: package 'Rtsne' was built under R version 4.0.2
## Warning: package 'breakDown' was built under R version 4.0.2
## 
## Attaching package: 'breakDown'
## The following object is masked from 'package:VIM':
## 
##     wine
## Warning: package 'gridExtra' was built under R version 4.0.2
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Warning: package 'lightgbm' was built under R version 4.0.2
## Loading required package: R6
## Warning: package 'R6' was built under R version 4.0.2
## 
## Attaching package: 'lightgbm'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following objects are masked from 'package:xgboost':
## 
##     getinfo, setinfo, slice
## The following object is masked from 'package:dplyr':
## 
##     slice
## Warning: package 'glmnet' was built under R version 4.0.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-2

Importing Data

# Read the training set from the local Data directory
train <- read_csv("./Data/train.csv")
## Rows: 9557 Columns: 143
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): Id, idhogar, dependency, edjefe, edjefa
## dbl (138): v2a1, hacdor, rooms, hacapo, v14a, refrig, v18q, v18q1, r4h1, r4h...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the test set; per the column summaries it has one column fewer than train
test <- read_csv("./Data/test.csv")
## Rows: 23856 Columns: 142
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): Id, idhogar, dependency, edjefe, edjefa
## dbl (137): v2a1, hacdor, rooms, hacapo, v14a, refrig, v18q, v18q1, r4h1, r4h...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Report rows x columns of each table
cat("Dimensions for training data : ", dim(train))
## Dimensions for training data :  9557 143
cat("Dimensions for testing data: ", dim(test))
## Dimensions for testing data:  23856 142
# Peek at the first rows of the training data
head(train)
## # A tibble: 6 × 143
##   Id         v2a1 hacdor rooms hacapo  v14a refrig  v18q v18q1  r4h1  r4h2  r4h3
##   <chr>     <dbl>  <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ID_2796… 190000      0     3      0     1      1     0    NA     0     1     1
## 2 ID_f29e… 135000      0     4      0     1      1     1     1     0     1     1
## 3 ID_68de…     NA      0     8      0     1      1     0    NA     0     0     0
## 4 ID_d671… 180000      0     5      0     1      1     1     1     0     2     2
## 5 ID_d56d… 180000      0     5      0     1      1     1     1     0     2     2
## 6 ID_ec05… 180000      0     5      0     1      1     1     1     0     2     2
## # … with 131 more variables: r4m1 <dbl>, r4m2 <dbl>, r4m3 <dbl>, r4t1 <dbl>,
## #   r4t2 <dbl>, r4t3 <dbl>, tamhog <dbl>, tamviv <dbl>, escolari <dbl>,
## #   rez_esc <dbl>, hhsize <dbl>, paredblolad <dbl>, paredzocalo <dbl>,
## #   paredpreb <dbl>, pareddes <dbl>, paredmad <dbl>, paredzinc <dbl>,
## #   paredfibras <dbl>, paredother <dbl>, pisomoscer <dbl>, pisocemento <dbl>,
## #   pisoother <dbl>, pisonatur <dbl>, pisonotiene <dbl>, pisomadera <dbl>,
## #   techozinc <dbl>, techoentrepiso <dbl>, techocane <dbl>, techootro <dbl>, …

Encoding categorical features (Already one-hot encoded, reversing present encoding, adding new integer encoding)

One-hot encoding is one method of converting categorical data so that an algorithm can use it and make better predictions. With one-hot encoding, each categorical value becomes its own new column holding a binary value of 1 or 0, so every category is represented as a binary vector.

Cycle through and reverse the OHE process for these

##   pared_int piso_int techo_int abasta_int sanitario_int energcocinar_int
## 1         1        3         2          1             2                3
## 2         4        2         4          1             2                2
## 3         4        3         4          1             2                2
## 4         1        3         4          1             2                2
## 5         1        3         4          1             2                2
## 6         1        3         4          1             2                2
##   elimbasu_int epared_int etecho_int eviv_int estadocivil_int parentesco_int
## 1            1          2          1        1               4              1
## 2            1          2          2        2               4              1
## 3            1          2          3        3               6              1
## 4            1          3          3        3               7              6
## 5            1          3          3        3               2              5
## 6            1          3          3        3               2              1
##   instlevel_int tipovivi_int lugar_int area_int
## 1             4            3         1        1
## 2             8            3         1        1
## 3             5            1         1        1
## 4             4            3         1        1
## 5             5            3         1        1
## 6             5            3         1        1
##             Id   v2a1 hacdor rooms hacapo v14a refrig v18q v18q1 r4h1 r4h2 r4h3
## 1 ID_279628684 190000      0     3      0    1      1    0    NA    0    1    1
## 2 ID_f29eb3ddd 135000      0     4      0    1      1    1     1    0    1    1
## 3 ID_68de51c94     NA      0     8      0    1      1    0    NA    0    0    0
## 4 ID_d671db89c 180000      0     5      0    1      1    1     1    0    2    2
## 5 ID_d56d6f5f5 180000      0     5      0    1      1    1     1    0    2    2
## 6 ID_ec05b1a7b 180000      0     5      0    1      1    1     1    0    2    2
##   r4m1 r4m2 r4m3 r4t1 r4t2 r4t3 tamhog tamviv escolari rez_esc hhsize
## 1    0    0    0    0    1    1      1      1       10      NA      1
## 2    0    0    0    0    1    1      1      1       12      NA      1
## 3    0    1    1    0    1    1      1      1       11      NA      1
## 4    1    1    2    1    3    4      4      4        9       1      4
## 5    1    1    2    1    3    4      4      4       11      NA      4
## 6    1    1    2    1    3    4      4      4       11      NA      4
##   paredblolad paredzocalo paredpreb pareddes paredmad paredzinc paredfibras
## 1           1           0         0        0        0         0           0
## 2           0           0         0        0        1         0           0
## 3           0           0         0        0        1         0           0
## 4           1           0         0        0        0         0           0
## 5           1           0         0        0        0         0           0
## 6           1           0         0        0        0         0           0
##   paredother pisomoscer pisocemento pisoother pisonatur pisonotiene pisomadera
## 1          0          1           0         0         0           0          0
## 2          0          0           0         0         0           0          1
## 3          0          1           0         0         0           0          0
## 4          0          1           0         0         0           0          0
## 5          0          1           0         0         0           0          0
## 6          0          1           0         0         0           0          0
##   techozinc techoentrepiso techocane techootro cielorazo abastaguadentro
## 1         0              1         0         0         1               1
## 2         1              0         0         0         1               1
## 3         1              0         0         0         1               1
## 4         1              0         0         0         1               1
## 5         1              0         0         0         1               1
## 6         1              0         0         0         1               1
##   abastaguafuera abastaguano public planpri noelec coopele sanitario1
## 1              0           0      1       0      0       0          0
## 2              0           0      1       0      0       0          0
## 3              0           0      1       0      0       0          0
## 4              0           0      1       0      0       0          0
## 5              0           0      1       0      0       0          0
## 6              0           0      1       0      0       0          0
##   sanitario2 sanitario3 sanitario5 sanitario6 energcocinar1 energcocinar2
## 1          1          0          0          0             0             0
## 2          1          0          0          0             0             1
## 3          1          0          0          0             0             1
## 4          1          0          0          0             0             1
## 5          1          0          0          0             0             1
## 6          1          0          0          0             0             1
##   energcocinar3 energcocinar4 elimbasu1 elimbasu2 elimbasu3 elimbasu4 elimbasu5
## 1             1             0         1         0         0         0         0
## 2             0             0         1         0         0         0         0
## 3             0             0         1         0         0         0         0
## 4             0             0         1         0         0         0         0
## 5             0             0         1         0         0         0         0
## 6             0             0         1         0         0         0         0
##   elimbasu6 epared1 epared2 epared3 etecho1 etecho2 etecho3 eviv1 eviv2 eviv3
## 1         0       0       1       0       1       0       0     1     0     0
## 2         0       0       1       0       0       1       0     0     1     0
## 3         0       0       1       0       0       0       1     0     0     1
## 4         0       0       0       1       0       0       1     0     0     1
## 5         0       0       0       1       0       0       1     0     0     1
## 6         0       0       0       1       0       0       1     0     0     1
##   dis male female estadocivil1 estadocivil2 estadocivil3 estadocivil4
## 1   0    1      0            0            0            0            1
## 2   0    1      0            0            0            0            1
## 3   1    0      1            0            0            0            0
## 4   0    1      0            0            0            0            0
## 5   0    0      1            0            1            0            0
## 6   0    1      0            0            1            0            0
##   estadocivil5 estadocivil6 estadocivil7 parentesco1 parentesco2 parentesco3
## 1            0            0            0           1           0           0
## 2            0            0            0           1           0           0
## 3            0            1            0           1           0           0
## 4            0            0            1           0           0           1
## 5            0            0            0           0           1           0
## 6            0            0            0           1           0           0
##   parentesco4 parentesco5 parentesco6 parentesco7 parentesco8 parentesco9
## 1           0           0           0           0           0           0
## 2           0           0           0           0           0           0
## 3           0           0           0           0           0           0
## 4           0           0           0           0           0           0
## 5           0           0           0           0           0           0
## 6           0           0           0           0           0           0
##   parentesco10 parentesco11 parentesco12   idhogar hogar_nin hogar_adul
## 1            0            0            0 21eb7fcc1         0          1
## 2            0            0            0 0e5d7a658         0          1
## 3            0            0            0 2c7317ea8         0          1
## 4            0            0            0 2b58d945f         2          2
## 5            0            0            0 2b58d945f         2          2
## 6            0            0            0 2b58d945f         2          2
##   hogar_mayor hogar_total dependency edjefe edjefa meaneduc instlevel1
## 1           0           1         no     10     no       10          0
## 2           1           1          8     12     no       12          0
## 3           1           1          8     no     11       11          0
## 4           0           4        yes     11     no       11          0
## 5           0           4        yes     11     no       11          0
## 6           0           4        yes     11     no       11          0
##   instlevel2 instlevel3 instlevel4 instlevel5 instlevel6 instlevel7 instlevel8
## 1          0          0          1          0          0          0          0
## 2          0          0          0          0          0          0          1
## 3          0          0          0          1          0          0          0
## 4          0          0          1          0          0          0          0
## 5          0          0          0          1          0          0          0
## 6          0          0          0          1          0          0          0
##   instlevel9 bedrooms overcrowding tipovivi1 tipovivi2 tipovivi3 tipovivi4
## 1          0        1     1.000000         0         0         1         0
## 2          0        1     1.000000         0         0         1         0
## 3          0        2     0.500000         1         0         0         0
## 4          0        3     1.333333         0         0         1         0
## 5          0        3     1.333333         0         0         1         0
## 6          0        3     1.333333         0         0         1         0
##   tipovivi5 computer television mobilephone qmobilephone lugar1 lugar2 lugar3
## 1         0        0          0           1            1      1      0      0
## 2         0        0          0           1            1      1      0      0
## 3         0        0          0           0            0      1      0      0
## 4         0        0          0           1            3      1      0      0
## 5         0        0          0           1            3      1      0      0
## 6         0        0          0           1            3      1      0      0
##   lugar4 lugar5 lugar6 area1 area2 age SQBescolari SQBage SQBhogar_total
## 1      0      0      0     1     0  43         100   1849              1
## 2      0      0      0     1     0  67         144   4489              1
## 3      0      0      0     1     0  92         121   8464              1
## 4      0      0      0     1     0  17          81    289             16
## 5      0      0      0     1     0  37         121   1369             16
## 6      0      0      0     1     0  38         121   1444             16
##   SQBedjefe SQBhogar_nin SQBovercrowding SQBdependency SQBmeaned agesq
## 1       100            0        1.000000             0       100  1849
## 2       144            0        1.000000            64       144  4489
## 3         0            0        0.250000            64       121  8464
## 4       121            4        1.777778             1       121   289
## 5       121            4        1.777778             1       121  1369
## 6       121            4        1.777778             1       121  1444
##   pared_int piso_int techo_int abasta_int sanitario_int energcocinar_int
## 1         1        3         2          1             2                3
## 2         4        2         4          1             2                2
## 3         4        3         4          1             2                2
## 4         1        3         4          1             2                2
## 5         1        3         4          1             2                2
## 6         1        3         4          1             2                2
##   elimbasu_int epared_int etecho_int eviv_int estadocivil_int parentesco_int
## 1            1          2          1        1               4              1
## 2            1          2          2        2               4              1
## 3            1          2          3        3               6              1
## 4            1          3          3        3               7              6
## 5            1          3          3        3               2              5
## 6            1          3          3        3               2              1
##   instlevel_int tipovivi_int lugar_int area_int
## 1             4            3         1        1
## 2             8            3         1        1
## 3             5            1         1        1
## 4             4            3         1        1
## 5             5            3         1        1
## 6             5            3         1        1

Introducing Ridge Encoding

# Grab OHE features

# Pass every integer-coded column through factor() so that model.matrix()
# treats the codes as categories rather than as numbers.
new_features_integer <- data.frame(apply(new_features_integer, 2, factor))

# Indicator (design) matrix without an intercept column, built from the first
# nrow(train) rows only. seq_len() stays empty when train has no rows, whereas
# 1:nrow(train) would yield c(1, 0).
# NOTE(review): assumes the training rows come first in new_features_integer -
# confirm against the code that builds it.
ridge_features <- model.matrix(
  ~ . - 1,
  new_features_integer[seq_len(nrow(train)), ]
)

# Binary response: 0 when Target == 4, 1 for every other class
ridge_response <- ifelse(train$Target == 4, 0, 1)


# Run ridge

# alpha = 0 selects the ridge penalty; lambda is chosen by 5-fold CV.
# model.matrix() already returns a matrix, so no as.matrix() is needed.
# NOTE(review): no seed is set in the visible code, so the CV fold assignment
# (and therefore lambda.min) may differ between runs.
cv.fit <- cv.glmnet(
  x = ridge_features,
  y = as.factor(ridge_response),
  nfolds = 5,
  alpha = 0,
  family = "binomial"
)

# Cross-validation curve across the lambda path
plot(cv.fit)

### Final preprocessing

# Rather than taking the predictions themselves, we'll take the coefficients
# as encoding

# coef() returns a sparse one-column matrix; @i holds the 0-based row indices
# of the stored entries and @x their values, so the two line up one-to-one.
tmp_coeffs <- coef(cv.fit, s = "lambda.min")
coefs <- data.frame(
  name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1],
  coefficient = tmp_coeffs@x
)


# Lookup

# Adjust original feature level names: prefix every value with its column
# name (e.g. "pared_int" + "4") so it has the same form as the coefficient
# names. This overwrites new_features_integer in place, as before.
for (i in seq_along(new_features_integer)) {
  new_features_integer[, i] <- paste0(
    names(new_features_integer)[i],
    new_features_integer[, i]
  )
}

# Same shape as the label table, each cell replaced by its matching coefficient
new_features_ridge <- new_features_integer
new_features_ridge[] <- coefs$coefficient[
  match(unlist(new_features_integer), coefs$name)
]

# Zeros can replace the NA since these are just base level

new_features_ridge[is.na(new_features_ridge)] <- 0

# Rename for now ...
# Only the trailing "_int" suffix is swapped; an unanchored gsub("int", ...)
# would also rewrite any "int" occurring elsewhere in a column name.

names(new_features_ridge) <- sub("_int$", "_ridge", names(new_features_ridge))

# Attach to data

full <- data.frame(cbind(full, new_features_ridge), stringsAsFactors = FALSE)

Visualization

# Filter the data for those features that actually have some missing data

# vapply() guarantees a logical mask (one TRUE/FALSE per column), whatever the
# shape of `full`; sapply() can silently change its return type.
full_missing <- full[, vapply(full, anyNA, logical(1)), drop = FALSE]

cat("Missing data found in ", ncol(full_missing)/ncol(full)*100, "% of features")
## Missing data found in  2.873563 % of features
# Plot missing information for these features

# TRUE is spelled out: T is an ordinary variable that can be reassigned
aggr(full_missing, prop = TRUE, numbers = TRUE, cex.axis = 0.8)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

### Correlation

# Select numeric features (removing the categorical features for now and those with high % NA)

# where()/any_of() replace the superseded select_if()/one_of() helpers.
# Dropped: the one-hot columns listed in ohe_names, plus v2a1, v18q1 and
# rez_esc (the high-NA features mentioned above).
full_numerical <- full %>%
  select(where(is.numeric)) %>%
  select(-any_of(ohe_names), -v2a1, -v18q1, -rez_esc)

# Since the number of NA is so low for the remaining features I will just replace them with their column mean

# Mean-impute a vector: every NA in `x` is replaced by the mean of the
# non-missing values of `x`; all other entries are returned unchanged.
# NOTE(review): this name is also exported by tidyr (attached above), so this
# definition presumably masks tidyr's version for the rest of the script.
replace_na <- function(x) {
  observed_mean <- mean(x, na.rm = TRUE)
  x[is.na(x)] <- observed_mean
  x
}
# Impute column by column; apply() hands back a matrix, and full_numerical is
# a matrix from this point on.
full_numerical <- apply(full_numerical, MARGIN = 2, FUN = replace_na)

# Create the matrix

full_numerical_cor <- cor(full_numerical, use = "complete.obs")

# Plot the matrix with some clustering
# (order = "hclust" arranges the variables by hierarchical clustering)

corrplot(full_numerical_cor, order = "hclust", tl.cex = 0.5)

# PCA runs on the numerical matrix built earlier, which already excludes the
# categorical information. Every variable is centred and scaled first.

full_pca <- prcomp(full_numerical, center = TRUE, scale. = TRUE)

# Append the scores of the first three components to the full data set
full <- cbind(full, full_pca$x[, 1:3])

# scree plot

fviz_eig(full_pca)

# plot first two components
fviz_pca_var(
  full_pca,
  col.var = "contrib",                  # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  select.var = list(contrib = 30),      # top 30 contributing variables
  repel = TRUE                          # avoid overlapping text labels
)
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Grab training set for visualisation

# Keep the three PCA scores of the training rows and attach the target, both
# as the original class (factor) and as a two-level label (Target 4 vs. rest).
# The label spelling is kept exactly as in the original so the legend matches
# the t-SNE plot later in the document.
train_pca <- full[seq_len(nrow(train)), ] %>%
  select(PC1, PC2, PC3) %>%
  mutate(
    Target = as.factor(train$Target),
    Target_Binary = ifelse(Target == 4, "Non-vunerable", "Vunerable")
  )

# Plot

# `title` is an argument of layout() itself, next to `scene`. It used to sit
# inside the scene list, unlike the t-SNE plot later in this document.
plot_ly(train_pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Target) %>%
  add_markers(marker = list(
      size = 3,
      opacity = 0.5)) %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')),
         title = 'PCA plot (original target)')
plot_ly(train_pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Target_Binary) %>%
  add_markers(marker = list(
      size = 3,
      opacity = 0.5)) %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')),
         title = 'PCA plot (binary target)')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Run tsne on sample of numerical features we used for PCA (training set only)

train_tsne <- full_numerical[seq_len(nrow(train)), , drop = FALSE]

# Appears we have duplicates in our data - let's look

is_duplicate <- duplicated(train_tsne)
train_duplicates <- which(is_duplicate)

# A handful of rows are exact duplicates (the Rtsne log below reports 9544 of
# the 9557 training rows remaining): I suppose this could happen since we may
# have two adults in the same house with all numerical features being identical

# Remove dupes and scale
# Rows are dropped with a logical mask. Negative indexing with
# -which(duplicated(x)) would drop EVERY row if there were no duplicates,
# because -integer(0) selects nothing.

train_tsne_deduped <- train_tsne[!is_duplicate, , drop = FALSE]
train_tsne_labels <- as.factor(train$Target[!is_duplicate])


# Let's also include a binary version of our target to make visualisation easier

train_tsne_labels_binary <- as.factor(
  ifelse(train$Target[!is_duplicate] == 4, "Non-vunerable", "Vunerable")
)

# Three-dimensional t-SNE embedding of the de-duplicated training rows.
# NOTE(review): no seed is set in the visible code, so the embedding may
# differ between runs.
tsne <- Rtsne(
  train_tsne_deduped,
  dims = 3,
  perplexity = 50,
  theta = 0.35,
  max_iter = 500,
  pca_scale = TRUE,
  verbose = TRUE
)
## Performing PCA
## Read the 9544 x 50 data matrix successfully!
## Using no_dims = 3, perplexity = 50.000000, and theta = 0.350000
## Computing input similarities...
## Building tree...
## Done in 20.41 seconds (sparsity = 0.024120)!
## Learning embedding...
## Iteration 50: error is 90.364804 (50 iterations in 11.41 seconds)
## Iteration 100: error is 90.346986 (50 iterations in 16.57 seconds)
## Iteration 150: error is 89.100517 (50 iterations in 17.90 seconds)
## Iteration 200: error is 88.892110 (50 iterations in 12.29 seconds)
## Iteration 250: error is 88.876462 (50 iterations in 11.34 seconds)
## Iteration 300: error is 2.950109 (50 iterations in 7.90 seconds)
## Iteration 350: error is 2.553811 (50 iterations in 8.32 seconds)
## Iteration 400: error is 2.344003 (50 iterations in 8.99 seconds)
## Iteration 450: error is 2.207216 (50 iterations in 8.06 seconds)
## Iteration 500: error is 2.109347 (50 iterations in 8.51 seconds)
## Fitting performed in 111.29 seconds.
# Plot with binary labels

# One row per embedded point: the three t-SNE coordinates plus the binary label
tsne_plot_binary <- data.frame(
  x = tsne$Y[, 1],
  y = tsne$Y[, 2],
  z = tsne$Y[, 3],
  Target = train_tsne_labels_binary
)

# Interactive 3D scatter coloured by the binary label
plot_ly(tsne_plot_binary, x = ~x, y = ~y, z = ~z, color = ~Target) %>%
  add_markers(marker = list(size = 3, opacity = 0.5)) %>%
  layout(
    scene = list(
      xaxis = list(title = 'D1'),
      yaxis = list(title = 'D2'),
      zaxis = list(title = 'D3')
    ),
    title = 't-SNE plot (binary target)'
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Select features

important_features <- c(
  "escolari",
  "dependency",
  "age",
  "qmobilephone",
  "PC1",
  "PC2"
)

# Training rows only, restricted to the selected features plus the target.
# `dependency` is stored as text mixing numbers with "yes"/"no":
#   "no"  -> 0
#   "yes" -> mean of the numeric-valued entries
#   other -> the number itself
# The mean is taken over the numeric entries only. Calling mean() on the raw
# character column returned NA with a warning, so every "yes" row became NA.
# NOTE(review): in the printed head of the data, rows with dependency "yes"
# have SQBdependency 1, which suggests "yes" may encode 1 - confirm whether
# the mean is really the intended substitute.
important <- full[seq_len(nrow(train)), ] %>%
  mutate(Target = train$Target) %>%
  select(all_of(important_features), Target) %>%
  mutate(
    dependency = as.character(dependency),
    dependency = case_when(
      dependency == "no" ~ 0,
      dependency == "yes" ~ mean(
        as.numeric(dependency[!dependency %in% c("yes", "no")]),
        na.rm = TRUE
      ),
      TRUE ~ as.numeric(replace(dependency, dependency %in% c("yes", "no"), NA))
    )
  )
# Box plots of the six selected features, one panel each, grouped by Target
featurePlot(
  x = important[, important_features],
  y = as.factor(important[, "Target"]),
  plot = "box",
  scales = list(
    y = list(relation = "free"),
    x = list(rot = 90)
  ),
  layout = c(3, 2),
  auto.key = list(columns = 3)
)

# Density curves of the same features, again grouped by Target
featurePlot(
  x = important[, important_features],
  y = as.factor(important[, "Target"]),
  plot = "density",
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(3, 2),
  auto.key = list(columns = 3)
)

# Add split label

# Selected features for ALL rows (train and test), with `dependency` turned
# into a number: "no" -> 0, "yes" -> mean of the numeric-valued entries, any
# other value -> the number itself. Calling mean() on the raw character column
# returned NA with a warning, so every "yes" row became NA.
train_test_data <- full %>%
  select(all_of(important_features)) %>%
  mutate(
    dependency = as.character(dependency),
    dependency = case_when(
      dependency == "no" ~ 0,
      dependency == "yes" ~ mean(
        as.numeric(dependency[!dependency %in% c("yes", "no")]),
        na.rm = TRUE
      ),
      TRUE ~ as.numeric(replace(dependency, dependency %in% c("yes", "no"), NA))
    )
  )

# The first nrow(train) rows are labelled "Train" and the rest "Test". The
# comparison on row position avoids (nrow(train) + 1):nrow(...), which counts
# backwards when there are no test rows.
train_test_data$Split <- ifelse(
  seq_len(nrow(train_test_data)) <= nrow(train),
  "Train",
  "Test"
)


# Have a look at the different distributions

# Density of each selected feature, train vs. test. The key has only two
# entries (Train/Test), so it is laid out in 2 columns; asking for 3 triggered
# the "not enough rows for columns" warning.
featurePlot(
  x = train_test_data[, important_features],
  y = as.factor(train_test_data[, "Split"]),
  plot = "density",
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(3, 2),
  auto.key = list(columns = 2)
)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.